require(quanteda)
require(LSX)
source("functions.R")

# Read the saved tokens object, then clean it in two separate passes.
toks <- readRDS("data_tokens_he.RDS")

# Pass 1: drop tokens matching the regex "\\d" (i.e. containing a digit).
# min_nchar = 3 is forwarded to quanteda's token selection as the minimum
# token length -- see ?tokens_select for how it interacts with removal.
toks <- tokens_remove(toks, "\\d", valuetype = "regex", min_nchar = 3)

# Pass 2: drop tokens matching entries of newsmap's bundled dictionary.
toks <- tokens_remove(toks, newsmap::data_dictionary_newsmap_he)

# NOTE: disabled because char_context() changed its behavior
# nuke <- char_context(toks, dict_he$targets, window = 20, min_count = 5) # min_count is half of JA

# Build the document-feature matrix from the cleaned tokens, leaving out the
# padding placeholders.
dfmt <- dfm(toks, remove_padding = TRUE)

# Discard rare features: only terms with a frequency of at least 5 are kept.
dfmt <- dfm_trim(dfmt, min_termfreq = 5)

# Turn the dictionary's seed entries into a seed-word vector; `upper` and
# `lower` pick which dictionary entries (by position) supply the two poles.
seed <- as.seedwords(
    dict_he$seeds,
    upper = 1,
    lower = 2
)

# Earlier call that also passed `nuke` (from when char_keyness was used):
# lss <- textmodel_lss(dfmt, seed, nuke, cache = TRUE)

# Fit the LSS model on the trimmed dfm with caching enabled, then persist it.
lss <- textmodel_lss(
    dfmt,
    seeds = seed,
    cache = TRUE
)
saveRDS(lss, file = "lss_he.RDS")

# Score documents with the fitted LSS model and save the result.

# Collapse the dfm with dfm_group()'s default grouping.
# NOTE(review): with no `groups` argument this presumably merges rows that
# share a document id -- confirm against the installed quanteda version.
dfmt_group <- dfm_group(dfmt)

# Document-level variables of the grouped dfm; scores are attached to these.
dat <- docvars(dfmt_group)

# `newdata` is spelled out in full: the original `newdat` only worked through
# R's partial argument matching, which is fragile if the method's signature
# ever gains another argument starting with the same prefix.
# density = TRUE makes predict() return both `fit` and `density`.
pred <- predict(lss, newdata = dfmt_group, density = TRUE)

dat$lss <- pred$fit
dat$density <- pred$density

# Blank out scores for documents whose density falls below the first quartile,
# so they are excluded from the standardisation below.
dat$lss[dat$density < quantile(dat$density, 0.25)] <- NA

# Re-standardise the remaining scores (scale() centres and scales, ignoring
# NAs); as.numeric() drops the matrix attributes that scale() returns.
dat$lss <- as.numeric(scale(dat$lss))
saveRDS(dat, "data_lss_he.RDS")
